Regression
On commence par importer le jeu de données et on vérifie s’il y a des valeurs manquantes, ce qui n’est pas le cas. On peut donc continuer avec l’analyse des données en vérifiant le type des variables :
On va transformer Bonus_Malus en variable binaire et retirer les variables qui ne sont pas utiles pour la prédiction, comme PolID.
library(rmarkdown)
library(dplyr)
# Import the training and test sets
train <- read.csv("./data/train_set.csv", header = TRUE, sep = ",", dec = ".")
test <- read.csv("./data/test_set.csv", header = TRUE, sep = ",", dec = ".")

# Count the missing cells in the training set
sum(is.na(train)) ## [1] 0

# Recode Bonus_Malus as a two-value label:
# values below 100 become "Bonus", everything else becomes "Malus"
train$Bonus_Malus <- ifelse(train$Bonus_Malus < 100, "Bonus", "Malus")
test$Bonus_Malus <- ifelse(test$Bonus_Malus < 100, "Bonus", "Malus")

# Drop the PolID column from both sets
train <- train %>%
  select(-PolID)
test <- test %>%
  select(-PolID)
# aperçu des données
paged_table(train)
On peut maintenant continuer avec l’analyse des données en vérifiant le type des variables :
library(kableExtra)
# Classify the columns of `train` with the project helper, then wrap the
# resulting name vectors in one-column data frames for display.
variables <- classifier_variables_tab(train)
numeric_variables <- data.frame(
  variables_numériques = variables$variables_numeriques
)
# Categorical and binary variables are listed together.
categorical_variables <- data.frame(
  variables_catégorielles = c(
    variables$variables_categorielles,
    variables$variables_binaires
  )
)
# categorical_variables %>%
kable(categorical_variables) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)

| variables_catégorielles |
|---|
| Car_Model |
| Urban_rural_class |
| French_region |
| Bonus_Malus |
| Car_Fuel |
# numeric_variables %>%
kable(numeric_variables) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)

| variables_numériques |
|---|
| Claim |
| Period_Exp |
| Car_Power |
| Car_Age |
| Age |
| Inhab_density |
# On va convertir les variables catégorielles en facteurs ; on obtient alors :
# Re-run the project helper to get the variable names as plain vectors.
variables <- classifier_variables_tab(train)
numeric_variables <- variables$variables_numeriques
categorical_variables <- c(
  variables$variables_categorielles,
  variables$variables_binaires
)

# Convert the categorical columns of the training set to factors.
train[categorical_variables] <- lapply(train[categorical_variables], factor)

# Build the test factors with the training levels so that both sets share the
# same level set and the same integer coding. A test value that never occurs
# in `train` becomes NA instead of silently creating a level unknown to `train`.
test[categorical_variables] <- Map(
  function(column, reference) factor(column, levels = levels(reference)),
  test[categorical_variables],
  train[categorical_variables]
)
str(train)
## 'data.frame': 542389 obs. of 11 variables:
## $ Claim : int 4 5 8 4 11 4 0 0 0 0 ...
## $ Period_Exp : num 0.56 1 0.41 0.27 0.08 0.1 0.96 0.73 0.09 0.73 ...
## $ Car_Power : int 4 7 4 5 4 4 14 10 4 5 ...
## $ Car_Age : int 4 9 12 9 13 1 25 2 12 4 ...
## $ Age : int 46 67 52 23 53 31 49 38 27 32 ...
## $ Bonus_Malus : Factor w/ 2 levels "Bonus","Malus": 1 1 1 1 1 1 1 1 1 1 ...
## $ Car_Model : Factor w/ 11 levels "B1","B10","B11",..: 9 7 1 8 1 4 2 4 10 8 ...
## $ Car_Fuel : Factor w/ 2 levels "Diesel","Regular": 1 1 2 1 2 2 2 1 2 1 ...
## $ Urban_rural_class: Factor w/ 6 levels "A","B","C","D",..: 1 5 4 5 4 5 5 3 3 3 ...
## $ Inhab_density : int 29 4762 824 6924 824 2983 5053 160 229 461 ...
## $ French_region : Factor w/ 22 levels "Alsace","Aquitaine",..: 7 21 13 12 13 17 12 20 6 6 ...
##
Étude des variables catégorielles :
0.1 Car Model
0.2 Bonus_Malus
0.3 Urban_rural_class
0.4 Car_Fuel
0.5 French_region
1 Étude des variables numériques
1.1 Inhab_density
# Print three plots for one numeric column of `data`:
#   1. a density-scaled histogram with a density curve overlaid,
#   2. a boxplot,
#   3. a scatter plot of the column against the `Claim` column.
#
# Args:
#   data:     data frame holding the column `variable` and a `Claim` column.
#   variable: name of the column to plot, given as a string.
#
# The plots are printed in the order above; the value of the last `print()`
# call is the function's result, as before.
plot_numeric <- function(data, variable) {
  # `.data[[variable]]` replaces the deprecated `aes_string()`; the axis label
  # is set explicitly so it stays equal to the column name.
  p1 <- ggplot(data, aes(x = .data[[variable]])) +
    geom_histogram(
      # `after_stat(density)` replaces the deprecated `..density..` notation.
      aes(y = after_stat(density)),
      bins = 30, fill = "lightblue", color = "black"
    ) +
    geom_density(alpha = 0.2, fill = "#FF6666") +
    labs(title = paste("Distribution de la variable", variable), x = variable) +
    theme_bw()

  p2 <- ggplot(data, aes(x = .data[[variable]])) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(title = paste("Boxplot de la variable", variable), x = variable) +
    theme_bw()

  # Use the `data` argument here: the previous version read the global `train`,
  # so this plot ignored whatever data frame the caller passed in.
  p3 <- ggplot(data, aes(x = .data[[variable]], y = Claim)) +
    geom_point(alpha = 0.6, color = "darkorange") +
    labs(
      title = paste("Relation entre", variable, "et nombre de sinistres"),
      x = variable, y = "Nombre de sinistres"
    ) +
    theme_minimal()

  print(p1)
  print(p2)
  print(p3)
}
# Build a stacked histogram of the column `col`, filled by the value of `Claim`.
#
# Args:
#   data: data frame holding the column `col` and a `Claim` column.
#   col:  name of the column to plot, given as a string.
#
# Returns the ggplot object of the histogram (20 bins). Despite the function
# name, no boxplot is returned: the previous version built one but never used
# it, so that dead code has been removed.
box_plot <- function(data, col) {
  # Treat Claim as a discrete variable so it can drive the fill colour.
  data$Claim <- as.factor(data$Claim)

  ggplot(data, aes(x = .data[[col]], fill = Claim)) +
    geom_histogram(color = "black", bins = 20, alpha = 1) +
    labs(
      title = paste("Histogramme de", col, "par Claim"),
      x = col, y = "Nombre"
    ) +
    theme_bw()
}
plot_numeric(train, "Inhab_density")
## [1] 0
2 Analyse de la target
## Analyse des corrélations
Une heatmap pour visualiser les corrélations entre les variables numériques.
library(reshape2)
library(corrplot)
# Correlations between the numeric columns of `train`, drawn as a heatmap.
num_vars <- train[, c(
  "Claim", "Period_Exp", "Car_Power",
  "Car_Age", "Age", "Inhab_density"
)]
corr_matrix <- cor(num_vars)

# Reshape the matrix to long format; the plot below reads the columns
# Var1, Var2 and value from the result.
melted_cor <- melt(corr_matrix)

ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "red", high = "blue", mid = "white", midpoint = 0
  ) +
  labs(title = "Heatmap des corrélations", x = "", y = "")